Data Exploration with Pandas


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Load the Titanic training data from CSV into a DataFrame
df = pd.read_csv('./data/titanic-train.csv')

In [3]:
type(df)


Out[3]:
pandas.core.frame.DataFrame

In [4]:
df.head()


Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [5]:
# Structural overview: index range, per-column non-null counts and dtypes, memory usage
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns only
df.describe()


Out[6]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

Indexing


In [7]:
# Position-based row access: iloc[3] returns the fourth row as a Series
df.iloc[3]


Out[7]:
PassengerId                                               4
Survived                                                  1
Pclass                                                    1
Name           Futrelle, Mrs. Jacques Heath (Lily May Peel)
Sex                                                  female
Age                                                      35
SibSp                                                     1
Parch                                                     0
Ticket                                               113803
Fare                                                   53.1
Cabin                                                  C123
Embarked                                                  S
Name: 3, dtype: object

In [8]:
# Label-based access: rows labelled 0 through 4 (the end label is included) of column 'Ticket'
df.loc[0:4,'Ticket']


Out[8]:
0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object

In [9]:
# Bracket access with a single column name returns that column as a Series
df['Ticket'].head()


Out[9]:
0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object

In [10]:
# A list of names selects several columns and returns a DataFrame, in the order listed
df[['Embarked', 'Ticket']].head()


Out[10]:
Embarked Ticket
0 S A/5 21171
1 C PC 17599
2 S STON/O2. 3101282
3 S 113803
4 S 373450

Selections


In [11]:
# Boolean-mask selection: keep only the rows whose 'Age' is greater than 70
df[df['Age'] > 70]


Out[11]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
96 97 0 1 Goldschmidt, Mr. George B male 71.0 0 0 PC 17754 34.6542 A5 C
116 117 0 3 Connors, Mr. Patrick male 70.5 0 0 370369 7.7500 NaN Q
493 494 0 1 Artagaveytia, Mr. Ramon male 71.0 0 0 PC 17609 49.5042 NaN C
630 631 1 1 Barkworth, Mr. Algernon Henry Wilson male 80.0 0 0 27042 30.0000 A23 S
851 852 0 3 Svensson, Mr. Johan male 74.0 0 0 347060 7.7750 NaN S

In [12]:
# The comparison on its own produces the boolean mask: one True/False per row
df['Age'] > 70


Out[12]:
0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
861    False
862    False
863    False
864    False
865    False
866    False
867    False
868    False
869    False
870    False
871    False
872    False
873    False
874    False
875    False
876    False
877    False
878    False
879    False
880    False
881    False
882    False
883    False
884    False
885    False
886    False
887    False
888    False
889    False
890    False
Name: Age, Length: 891, dtype: bool

In [13]:
# Same selection written as a query string; column names are referenced directly
df.query("Age > 70")


Out[13]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
96 97 0 1 Goldschmidt, Mr. George B male 71.0 0 0 PC 17754 34.6542 A5 C
116 117 0 3 Connors, Mr. Patrick male 70.5 0 0 370369 7.7500 NaN Q
493 494 0 1 Artagaveytia, Mr. Ramon male 71.0 0 0 PC 17609 49.5042 NaN C
630 631 1 1 Barkworth, Mr. Algernon Henry Wilson male 80.0 0 0 27042 30.0000 A23 S
851 852 0 3 Svensson, Mr. Johan male 74.0 0 0 347060 7.7750 NaN S

In [14]:
# Rows satisfying both conditions. & is the element-wise AND; each comparison is
# parenthesised because & binds more tightly than ==
df[(df['Age'] == 11) & (df['SibSp'] == 5)]


Out[14]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
59 60 0 3 Goodwin, Master. William Frederick male 11.0 5 2 CA 2144 46.9 NaN S

In [15]:
# Rows satisfying either condition (| is the element-wise OR); columns are accessed as attributes here
df[(df.Age == 11) | (df.SibSp == 5)]


Out[15]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
59 60 0 3 Goodwin, Master. William Frederick male 11.0 5 2 CA 2144 46.9000 NaN S
71 72 0 3 Goodwin, Miss. Lillian Amy female 16.0 5 2 CA 2144 46.9000 NaN S
386 387 0 3 Goodwin, Master. Sidney Leonard male 1.0 5 2 CA 2144 46.9000 NaN S
480 481 0 3 Goodwin, Master. Harold Victor male 9.0 5 2 CA 2144 46.9000 NaN S
542 543 0 3 Andersson, Miss. Sigrid Elisabeth female 11.0 4 2 347082 31.2750 NaN S
683 684 0 3 Goodwin, Mr. Charles Edward male 14.0 5 2 CA 2144 46.9000 NaN S
731 732 0 3 Hassan, Mr. Houssein G N male 11.0 0 0 2699 18.7875 NaN C
802 803 1 1 Carter, Master. William Thornton II male 11.0 1 2 113760 120.0000 B96 B98 S

In [16]:
# The same OR-selection expressed with query()
df.query('(Age == 11) | (SibSp == 5)')


Out[16]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
59 60 0 3 Goodwin, Master. William Frederick male 11.0 5 2 CA 2144 46.9000 NaN S
71 72 0 3 Goodwin, Miss. Lillian Amy female 16.0 5 2 CA 2144 46.9000 NaN S
386 387 0 3 Goodwin, Master. Sidney Leonard male 1.0 5 2 CA 2144 46.9000 NaN S
480 481 0 3 Goodwin, Master. Harold Victor male 9.0 5 2 CA 2144 46.9000 NaN S
542 543 0 3 Andersson, Miss. Sigrid Elisabeth female 11.0 4 2 347082 31.2750 NaN S
683 684 0 3 Goodwin, Mr. Charles Edward male 14.0 5 2 CA 2144 46.9000 NaN S
731 732 0 3 Hassan, Mr. Houssein G N male 11.0 0 0 2699 18.7875 NaN C
802 803 1 1 Carter, Master. William Thornton II male 11.0 1 2 113760 120.0000 B96 B98 S

Unique Values


In [17]:
# Distinct values of 'Embarked'; the missing entries appear as nan
df['Embarked'].unique()


Out[17]:
array(['S', 'C', 'Q', nan], dtype=object)

Sorting


In [18]:
# Sorting descending by column 'Age'
df.sort_values('Age', ascending = False).head()


Out[18]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
630 631 1 1 Barkworth, Mr. Algernon Henry Wilson male 80.0 0 0 27042 30.0000 A23 S
851 852 0 3 Svensson, Mr. Johan male 74.0 0 0 347060 7.7750 NaN S
493 494 0 1 Artagaveytia, Mr. Ramon male 71.0 0 0 PC 17609 49.5042 NaN C
96 97 0 1 Goldschmidt, Mr. George B male 71.0 0 0 PC 17754 34.6542 A5 C
116 117 0 3 Connors, Mr. Patrick male 70.5 0 0 370369 7.7500 NaN Q

In [19]:
# Sorting ascending by column 'Age'
df.sort_values('Age', ascending = True).head()


Out[19]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
803 804 1 3 Thomas, Master. Assad Alexander male 0.42 0 1 2625 8.5167 NaN C
755 756 1 2 Hamalainen, Master. Viljo male 0.67 1 1 250649 14.5000 NaN S
644 645 1 3 Baclini, Miss. Eugenie female 0.75 2 1 2666 19.2583 NaN C
469 470 1 3 Baclini, Miss. Helene Barbara female 0.75 2 1 2666 19.2583 NaN C
78 79 1 2 Caldwell, Master. Alden Gates male 0.83 0 2 248738 29.0000 NaN S

Aggregations


In [20]:
# Number of rows for each distinct value of 'Survived', most frequent first
df['Survived'].value_counts()


Out[20]:
0    549
1    342
Name: Survived, dtype: int64

In [21]:
# Number of rows for each distinct value of 'Pclass', most frequent first
df['Pclass'].value_counts()


Out[21]:
3    491
1    216
2    184
Name: Pclass, dtype: int64

In [22]:
# Row count for every (Pclass, Survived) combination; 'PassengerId' serves only as a
# column without missing values to count
df.groupby(['Pclass', 'Survived'])['PassengerId'].count()


Out[22]:
Pclass  Survived
1       0            80
        1           136
2       0            97
        1            87
3       0           372
        1           119
Name: PassengerId, dtype: int64

In [23]:
# Min
df['Age'].min()


Out[23]:
0.41999999999999998

In [24]:
# Max
df['Age'].max()


Out[24]:
80.0

In [25]:
# Mean
df['Age'].mean()


Out[25]:
29.69911764705882

In [26]:
# Median
df['Age'].median()


Out[26]:
28.0

In [27]:
# Mean of 'Age' within each 'Survived' group
mean_age_by_survived = df.groupby('Survived')['Age'].mean()
mean_age_by_survived


Out[27]:
Survived
0    30.626179
1    28.343690
Name: Age, dtype: float64

In [28]:
# Standard deviation of 'Age' within each 'Survived' group
std_age_by_survived = df.groupby('Survived')['Age'].std()
std_age_by_survived


Out[28]:
Survived
0    14.172110
1    14.950952
Name: Age, dtype: float64

Merge


In [29]:
# Round both statistics to whole numbers and move 'Survived' from the index into a
# regular column, so the two results can be merged on it
df1 = mean_age_by_survived.round(0).reset_index()
df2 = std_age_by_survived.round(0).reset_index()

In [30]:
df1


Out[30]:
Survived Age
0 0 31.0
1 1 28.0

In [31]:
df2


Out[31]:
Survived Age
0 0 14.0
1 1 15.0

In [32]:
# Join the two summaries on 'Survived'; the clashing 'Age' columns come out as Age_x / Age_y
df3 = df1.merge(df2, on='Survived')

In [33]:
df3


Out[33]:
Survived Age_x Age_y
0 0 31.0 14.0
1 1 28.0 15.0

In [34]:
# Giving descriptive names to the columns (replaces the Age_x / Age_y names produced by the merge)
df3.columns = ['Survived', 'Average Age', 'Age Standard Deviation']

In [35]:
df3


Out[35]:
Survived Average Age Age Standard Deviation
0 0 31.0 14.0
1 1 28.0 15.0

Pivot Tables


In [36]:
# Cross-tabulate row counts: one row per 'Pclass' value, one column per 'Survived' value
df.pivot_table(values='PassengerId', index='Pclass', columns='Survived', aggfunc='count')


Out[36]:
Survived 0 1
Pclass
1 80 136
2 97 87
3 372 119

Correlations


In [37]:
# Boolean column marking rows where 'Sex' is 'female'; unlike the text column 'Sex',
# it can take part in the correlation computed below
df['IsFemale'] = (df['Sex'] == 'female')

In [38]:
df['IsFemale'].head()


Out[38]:
0    False
1     True
2     True
3     True
4    False
Name: IsFemale, dtype: bool

In [39]:
# Correlation of every numeric/boolean column with 'Survived', sorted ascending.
# Text columns (Name, Sex, Ticket, ...) are excluded explicitly: recent pandas
# versions raise an error in corr() instead of silently skipping them.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlated_with_survived = numeric_df.corr()['Survived'].sort_values()
correlated_with_survived


Out[39]:
Pclass        -0.338481
Age           -0.077221
SibSp         -0.035322
PassengerId   -0.005007
Parch          0.081629
Fare           0.257307
IsFemale       0.543351
Survived       1.000000
Name: Survived, dtype: float64

In [40]:
# Redundant: inline plotting is already enabled in the first cell of the notebook
%matplotlib inline

In [41]:
# Plotting correlation with column 'Survived'.
# The 'Survived' entry itself is left out because its self-correlation is always 1.
# It is dropped by label rather than by position, so the plot does not rely on
# 'Survived' happening to be the last entry after sorting.
correlated_with_survived.drop('Survived').plot(kind='bar',
                                               title='Titanic Passengers: correlation with survival')


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f61513a2710>

In [42]:
# Full correlation matrix of the numeric/boolean columns. Text columns are filtered
# out first because recent pandas versions raise an error in corr() when they are present.
df.select_dtypes(include=['number', 'bool']).corr()


Out[42]:
PassengerId Survived Pclass Age SibSp Parch Fare IsFemale
PassengerId 1.000000 -0.005007 -0.035144 0.036847 -0.057527 -0.001652 0.012658 -0.042939
Survived -0.005007 1.000000 -0.338481 -0.077221 -0.035322 0.081629 0.257307 0.543351
Pclass -0.035144 -0.338481 1.000000 -0.369226 0.083081 0.018443 -0.549500 -0.131900
Age 0.036847 -0.077221 -0.369226 1.000000 -0.308247 -0.189119 0.096067 -0.093254
SibSp -0.057527 -0.035322 0.083081 -0.308247 1.000000 0.414838 0.159651 0.114631
Parch -0.001652 0.081629 0.018443 -0.189119 0.414838 1.000000 0.216225 0.245489
Fare 0.012658 0.257307 -0.549500 0.096067 0.159651 0.216225 1.000000 0.182333
IsFemale -0.042939 0.543351 -0.131900 -0.093254 0.114631 0.245489 0.182333 1.000000

Visual Data Exploration with Matplotlib


In [43]:
# Fix the seed so the synthetic data, and every plot derived from it, is reproducible
np.random.seed(42)

# Four synthetic series of 1000 points each
data1 = np.random.normal(0, 0.1, 1000)                                            # noise around 0
data2 = np.random.normal(1, 0.4, 1000) + np.linspace(0, 1, 1000)                  # noise plus a linear ramp from 0 to 1
data3 = 2 + np.random.random(1000) * np.linspace(1, 5, 1000)                      # uniform noise whose scale grows from 1 to 5
data4 = np.random.normal(3, 0.2, 1000) + 0.3 * np.sin(np.linspace(0, 20, 1000))   # noise around 3 plus a sine wave

In [44]:
# Place the four series side by side as columns, giving a (1000, 4) array
data = np.column_stack([data1, data2, data3, data4])

In [45]:
# Wrap the array in a DataFrame with one named column per series.
# NOTE: this rebinds `df`; the Titanic frame loaded earlier is no longer reachable under this name.
df = pd.DataFrame(data, columns = ['data1', 'data2', 'data3', 'data4'])
df.head()


Out[45]:
data1 data2 data3 data4
0 0.170847 0.605400 2.432583 3.103751
1 0.090003 0.583349 2.397237 2.815148
2 0.158135 1.094243 2.706464 2.989091
3 -0.056954 0.054362 2.182445 2.953376
4 0.010518 1.100732 2.308244 3.072519

Line Plot


In [46]:
# One line per column, plotted against the row index
df.plot(figsize=(7, 7), title='Line plot')


Out[46]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f61512e2588>

In [47]:
# The same line plot drawn through pyplot directly; title and legend are added by hand,
# with the legend entries taken from the column names
plt.plot(df)
plt.title('Line plot')
plt.legend(df.columns.tolist())


Out[47]:
<matplotlib.legend.Legend at 0x7f614c207e10>

Scatter Plot


In [48]:
# Draw every column as unconnected point markers against the row index
df.plot(style='.', figsize=(7, 7))


Out[48]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f614c1d36a0>

In [49]:
# Scatter plot of data2 against data1 with fixed axis limits
df.plot.scatter(x='data1',
                y='data2',
                xlim=(-1.5, 1.5),
                ylim=(0, 3),
                figsize=(7, 7))


Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f614c15c978>

Histograms


In [50]:
# Overlaid histograms of all columns; alpha < 1 keeps overlapping bars visible
df.plot.hist(bins=50,
             alpha=0.6,
             figsize=(7, 7),
             title='Histogram')


Out[50]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f614c0debe0>

Cumulative distribution


In [51]:
# Normalised cumulative histograms of every column.
# `density` replaces the `normed` keyword, which was deprecated and later removed
# from matplotlib's hist; passing `normed` fails on current versions.
df.plot(kind = 'hist',
        figsize = (7, 7),
        bins = 100,
        title = 'Cumulative distributions',
        density = True,
        cumulative = True,
        alpha = 0.4)


Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6147fc1c18>

Box Plot


In [52]:
# One box per column
df.plot.box(figsize=(7, 7), title='Boxplot')


Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6147cf0e80>

Subplots


In [53]:
# 2 x 2 grid showing the same data as a line plot, point plot, histogram and box plot
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
(ax_line, ax_points), (ax_hist, ax_box) = ax

df.plot(ax=ax_line, title='Line plot')
df.plot(ax=ax_points, style='o', title='Scatter plot')
df.plot(ax=ax_hist, kind='hist', bins=50, title='Histogram')
df.plot(ax=ax_box, kind='box', title='Boxplot')

# Adjust spacing so titles and tick labels of neighbouring panels do not overlap
plt.tight_layout()


Pie charts


In [54]:
# Boolean Series that is True where data1 exceeds 0.1, then a tally of both outcomes
gt01 = df['data1'].gt(0.1)
piecounts = gt01.value_counts()
piecounts


Out[54]:
False    851
True     149
Name: data1, dtype: int64

In [55]:
# Plotting a pie chart
# explode : how far each wedge is offset from the centre (as a fraction of the radius)
# autopct : format string for the percentage printed on each wedge
# Labels and offsets are derived from the index of `piecounts`, so they stay attached
# to the right wedge whichever of the two outcomes happens to be more frequent.
pie_labels = ['> 0.1' if flag else '<= 0.1' for flag in piecounts.index]
pie_explode = [0.15 if flag else 0.0 for flag in piecounts.index]
piecounts.plot(kind = 'pie',
               figsize = (7, 7),
               explode = pie_explode,
               labels = pie_labels,
               autopct = '%1.2f%%',
               shadow = True,
               startangle = 90,
               fontsize = 16)
plt.legend(loc = "best")


Out[55]:
<matplotlib.legend.Legend at 0x7f6147a56ef0>

Hexbin plot


In [56]:
# Two 2-D Gaussian clusters stacked into one (3000, 2) array:
# 1000 points centred on (0, 0) with std 2, then 2000 points centred on (9, 9) with std 3
cluster_a = np.random.normal((0, 0), 2, size=(1000, 2))
cluster_b = np.random.normal((9, 9), 3, size=(2000, 2))
data = np.vstack([cluster_a, cluster_b])
df = pd.DataFrame(data, columns=['x', 'y'])

In [57]:
df.head()


Out[57]:
x y
0 -1.389318 -3.243604
1 4.715768 -2.839096
2 2.089588 0.514746
3 -2.882980 0.340440
4 1.842628 3.096558

In [58]:
df.plot()


Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6147383668>

In [59]:
df.plot(kind = 'kde')


Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6146d83198>

In [60]:
# 2-D density plot of y against x on a hexagonal grid.
# NOTE(review): in matplotlib's hexbin an integer `bins` discretises the per-hexagon counts
# into that many colour levels; the number of hexagons is controlled by `gridsize`.
# Confirm which of the two was intended here.
df.plot(kind='hexbin', 
        x = 'x', 
        y = 'y', 
        bins = 100, 
        cmap = 'rainbow')


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f61471b1c88>

Unstructured data

Images


In [61]:
from PIL import Image

In [62]:
# Open the image with Pillow; the trailing `img` expression displays it in the notebook
img = Image.open('./data/iss.jpg')
img


Out[62]:

In [63]:
type(img)


Out[63]:
PIL.JpegImagePlugin.JpegImageFile

In [64]:
# Convert the image into a NumPy array of pixel values
imgarray = np.asarray(img)

In [65]:
type(imgarray)


Out[65]:
numpy.ndarray

In [66]:
# Shape is (rows, columns, channels): 435 by 640 pixels, 3 channels
imgarray.shape


Out[66]:
(435, 640, 3)

In [67]:
# ravel() flattens the array to one dimension
imgarray.ravel().shape


Out[67]:
(835200,)

In [68]:
# Cross-check: rows * columns * channels gives the same number of elements as the flattened array
435 * 640 * 3


Out[68]:
835200

Sound


In [69]:
from scipy.io import wavfile

In [70]:
# wavfile.read returns the sample rate together with the samples as a NumPy array
rate, sound = wavfile.read(filename='./data/sms.wav')

In [71]:
from IPython.display import Audio

In [72]:
Audio(data = sound, rate = rate)


Out[72]:

In [73]:
# Same samples declared at half the true sample rate, so playback runs at half speed
Audio(data = sound, rate = 0.5 * rate)


Out[73]:

In [74]:
len(sound)


Out[74]:
110250

In [75]:
sound


Out[75]:
array([70, 14, 27, ..., 58, 68, 59], dtype=int16)

In [76]:
plt.plot(sound)


Out[76]:
[<matplotlib.lines.Line2D at 0x7f613a3666a0>]

In [77]:
# Spectrogram of the clip. Fs is the sample rate returned by wavfile.read rather than a
# hard-coded 44100, so the frequency and time axes are scaled correctly for whatever file was loaded.
plt.specgram(sound, NFFT=1024, Fs=rate)
plt.ylabel('Frequency (Hz)')
plt.xlabel('Time (s)')


Out[77]:
<matplotlib.text.Text at 0x7f613b03dcc0>

Data Exploration Exercises

Exercise 1

  • load the dataset: ./data/international-airline-passengers.csv
  • inspect it using the .info() and .head() commands
  • use the function pd.to_datetime() to change the column type of 'Month' to a datetime type
  • set the index of df to be a datetime index using the column 'Month' and the df.set_index() method
  • choose the appropriate plot and display the data
  • choose appropriate scale
  • label the axes

In [78]:
ex1 = pd.read_csv('./data/international-airline-passengers.csv')

In [79]:
ex1.head()


Out[79]:
Month Thousand Passengers
0 1949-01 112
1 1949-02 118
2 1949-03 132
3 1949-04 129
4 1949-05 121

In [80]:
ex1.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 2 columns):
Month                  144 non-null object
Thousand Passengers    144 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.3+ KB

In [81]:
# Parse the 'Month' strings (e.g. '1949-01') into datetime values
ex1['Month'] = pd.to_datetime(ex1['Month'])

In [82]:
# Use the parsed dates as the index so that the data is plotted against time
ex1 = ex1.set_index('Month')
ex1.head()


Out[82]:
Thousand Passengers
Month
1949-01-01 112
1949-02-01 118
1949-03-01 132
1949-04-01 129
1949-05-01 121

In [83]:
# Line plot of the monthly series. The exercise asks for labelled axes: the single
# column's name becomes the y-axis label (so the legend is not needed), and the x axis
# is labelled with the name of the datetime index.
ax = ex1.plot(title='International airline passengers', legend=False)
ax.set_xlabel('Month')
ax.set_ylabel('Thousand Passengers')


Out[83]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f613a815898>

Exercise 2

  • load the dataset: ./data/weight-height.csv
  • inspect it
  • plot it using a scatter plot with Weight as a function of Height
  • plot the male and female populations with 2 different colors on a new scatter plot
  • remember to label the axes

In [84]:
ex2 = pd.read_csv('./data/weight-height.csv')

In [85]:
ex2.head()


Out[85]:
Gender Height Weight
0 Male 73.847017 241.893563
1 Male 68.781904 162.310473
2 Male 74.110105 212.740856
3 Male 71.730978 220.042470
4 Male 69.881796 206.349801

In [86]:
ex2.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
Gender    10000 non-null object
Height    10000 non-null float64
Weight    10000 non-null float64
dtypes: float64(2), object(1)
memory usage: 234.5+ KB

In [87]:
ex2.describe()


Out[87]:
Height Weight
count 10000.000000 10000.000000
mean 66.367560 161.440357
std 3.847528 32.108439
min 54.263133 64.700127
25% 63.505620 135.818051
50% 66.318070 161.212928
75% 69.174262 187.169525
max 78.998742 269.989699

In [88]:
# Weight as a function of Height for the whole data set
ex2.plot.scatter(x='Height', y='Weight')


Out[88]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f613a574438>

In [89]:
# Split the data set into one frame per value of 'Gender'
ex2_males = ex2.loc[ex2['Gender'].eq('Male')]
ex2_females = ex2.loc[ex2['Gender'].eq('Female')]

In [90]:
# Both populations on one set of axes, distinguished by colour.
# Each call gets a label so the legend identifies the colours, and the title
# describes the whole figure (it previously read 'Male' although both groups are drawn).
fig, ax = plt.subplots(figsize = (10, 10))
ex2_males.plot(kind = 'scatter',
               x = 'Height',
               y = 'Weight',
               ax = ax,
               color = 'blue',
               alpha = 0.2,
               label = 'Male')
ex2_females.plot(kind = 'scatter',
                 x = 'Height',
                 y = 'Weight',
                 ax = ax,
                 color = 'red',
                 alpha = 0.2,
                 label = 'Female')
ax.set_xlabel('Height')
ax.set_ylabel('Weight')
ax.set_title('Weight vs. height: male and female populations')
ax.legend()


Out[90]:
<matplotlib.text.Text at 0x7f613a40e8d0>

Exercise 3

  • plot the histogram of the heights for males and for females on the same plot
  • use alpha to control transparency in the plot command
  • plot a vertical line at the mean of each population using plt.axvline()

In [91]:
ex3_males = ex2_males
ex3_females = ex2_females
fig, ax = plt.subplots(figsize = (10, 10))
# Explicit labels are required for a useful legend: both Series are named 'Height',
# so without them the two histograms would get identical legend entries.
ex3_males['Height'].plot(kind = 'hist',
                         bins = 30,
                         color = 'blue',
                         alpha = 0.3,
                         ax = ax,
                         label = 'Male')
ex3_females['Height'].plot(kind = 'hist',
                           bins = 30,
                           color = 'red',
                           alpha = 0.3,
                           ax = ax,
                           label = 'Female')
# Dashed vertical line at the mean height of each population
plt.axvline(ex3_males['Height'].mean(),
            linewidth = 3,
            linestyle='dashed',
            color = 'blue',
            label = 'Male mean')
plt.axvline(ex3_females['Height'].mean(),
            linewidth = 3,
            linestyle='dashed',
            color = 'red',
            label = 'Female mean')
plt.xlabel('Height')
plt.title('Height distribution by gender')
plt.legend()


Out[91]:
<matplotlib.legend.Legend at 0x7f613a3454e0>

Exercise 4

  • plot the weights of the males and females using a box plot
  • which one is easier to read?
  • (remember to put in titles, axes and legends)

In [92]:
ex4 = ex2
ex4.head()


Out[92]:
Gender Height Weight
0 Male 73.847017 241.893563
1 Male 68.781904 162.310473
2 Male 74.110105 212.740856
3 Male 71.730978 220.042470
4 Male 69.881796 206.349801

In [93]:
# Spread 'Weight' into one column per gender; every row keeps its original index,
# so the column of the other gender is NaN for that row
ex4_pivot = ex4.pivot(values='Weight', columns='Gender')

In [94]:
ex4_pivot.head()


Out[94]:
Gender Female Male
0 NaN 241.893563
1 NaN 162.310473
2 NaN 212.740856
3 NaN 220.042470
4 NaN 206.349801

In [95]:
ex4_pivot.tail()


Out[95]:
Gender Female Male
9995 136.777454 NaN
9996 170.867906 NaN
9997 128.475319 NaN
9998 163.852461 NaN
9999 113.649103 NaN

In [96]:
# Box plot of weight for each gender. The exercise asks for a title and axis labels,
# so they are set explicitly on the axes returned by the plot call.
ax = ex4_pivot.plot(figsize = (10, 10),
                    kind = 'box',
                    yticks = [120, 150, 200, 250],
                    title = 'Weight distribution by gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Weight')
plt.grid()
plt.show()


Exercise 5


In [97]:
ex5 = pd.read_csv('./data/titanic-train.csv')
ex5.head()


Out[97]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [98]:
# Dropping passenger Id.
# errors='ignore' keeps the cell re-runnable: a second execution finds the column
# already gone and leaves the frame unchanged instead of raising KeyError.
ex5 = ex5.drop('PassengerId', axis = 1, errors = 'ignore')

In [99]:
from pandas.plotting import scatter_matrix

In [100]:
# Grid of pairwise scatter plots for the numeric columns; binding the result to `_`
# keeps the returned array of axes out of the cell output
_ = scatter_matrix(ex5, figsize = (15, 15))


Exploring more info about the data


In [101]:
ex5['Age'].plot.kde(figsize = (7, 7))


Out[101]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6139cc1d30>

In [102]:
ex5['Fare'].plot.kde(figsize = (7, 7))


Out[102]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6139bcb780>

In [103]:
ex5['SibSp'].plot.kde(figsize = (7, 7))


Out[103]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6139cd02e8>